// CuNNy fast - https://github.com/funnyplanter/CuNNy

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// 
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
// 
// You should have received a copy of the GNU General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.


//!MAGPIE EFFECT
//!VERSION 4
//!SORT_NAME CuNNy-0003
//!USE MulAdd
//!CAPABILITY FP16

#include "../StubDefs.hlsli"

// Source texture of the effect. No size or format is declared for it here.
//!TEXTURE
Texture2D INPUT;

// Destination texture, declared at twice the width and height of INPUT.
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;

// Point-filtered sampler; this is the sampler used by the O() fetch macro.
//!SAMPLER
//!FILTER POINT
SamplerState SP;

// Linear-filtered sampler. Not referenced by the passes visible in this part
// of the file.
//!SAMPLER
//!FILTER LINEAR
SamplerState SL;

//!COMMON
// O(t, x, y): sample texture `t` with the point sampler SP at mip level 0, at
// coordinate `pos` displaced by (x, y) multiples of `pt`.
// The macro captures `pos` and `pt` by name, so both must be locals in scope
// wherever it is expanded (each pass function declares them).
#define O(t, x, y) t.SampleLevel(SP, pos + float2(x, y) * pt, 0)
// Short aliases for the MF4 / MF4x4 types, which are defined outside this file.
// NOTE(review): presumably 4-vector / 4x4-matrix types whose precision depends
// on the FP16 capability - confirm against the Magpie definitions.
#define V4 MF4
#define M4 MF4x4

// T0..T5: intermediate textures, each declared at INPUT resolution with the
// R8G8B8A8_UNORM format (four 8-bit normalized channels per texel).
// In the passes visible here: Pass 1 writes T0-T2, Pass 2 reads T0-T2 and
// writes T3-T5, Pass 3 reads T3-T5 and lists T0, T1 as its outputs.
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_UNORM
Texture2D T0;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_UNORM
Texture2D T1;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_UNORM
Texture2D T2;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_UNORM
Texture2D T3;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_UNORM
Texture2D T4;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_UNORM
Texture2D T5;

// Pass 1 reads INPUT and writes T0, T1 and T2.
//!PASS 1
//!DESC in (1x12)
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN INPUT
//!OUT T0, T1, T2

// L0(x, y): fetch INPUT at offset (x, y) via O() and reduce its RGB to a single
// MF scalar with the weights 0.299 / 0.587 / 0.114 (the Rec. 601 luma weights).
#define L0(x, y) MF(dot(MF3(0.299, 0.587, 0.114), O(INPUT, x, y).rgb))

// Pass 1 entry point: for one pixel, gathers the 3x3 neighbourhood of L0()
// values around it, forms three 4-component weighted sums (12 outputs, matching
// the "1x12" pass description), clamps them at zero and stores them in T0..T2.
//
// blockStart - added to the per-thread offset to form the pixel coordinate.
// tid        - only tid.x is used; Rmp8x8() turns it into a 2-D offset.
//              NOTE(review): presumably an offset inside the 8x8 block - confirm
//              against Magpie's Rmp8x8 definition.
void Pass1(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` must keep these exact names: the O()/L0() macros use them.
	float2 pt = float2(GetInputPt());
	const uint2 px = Rmp8x8(tid.x) + blockStart;
	const uint2 extent = GetInputSize();
	const bool outside = px.x >= extent.x || px.y >= extent.y;
	if (outside) {
		return;
	}
	float2 pos = (px + 0.5) * pt;

	// 3x3 taps. Suffix = x offset then y offset: m = -1, z = 0, p = +1.
	const MF t_mm = L0(-1.0, -1.0);
	const MF t_zm = L0(0.0, -1.0);
	const MF t_pm = L0(1.0, -1.0);
	const MF t_mz = L0(-1.0, 0.0);
	const MF t_zz = L0(0.0, 0.0);
	const MF t_pz = L0(1.0, 0.0);
	const MF t_mp = L0(-1.0, 1.0);
	const MF t_zp = L0(0.0, 1.0);
	const MF t_pp = L0(1.0, 1.0);

	// Outputs 0..3 -> T0. The accumulator starts from a constant offset
	// (presumably the layer bias) and the taps are added in row-major order.
	V4 acc0 = V4(-7.280e-05, 1.400e-04, -1.296e-04, 4.014e-04);
	acc0 = mad(t_mm, V4(3.974e-03, -4.994e-02, -1.106e-01, 7.984e-02), acc0);
	acc0 = mad(t_zm, V4(8.621e-01, 4.326e-01, 5.918e-01, -1.650e-01), acc0);
	acc0 = mad(t_pm, V4(1.085e-02, -1.058e-01, -7.681e-03, -8.618e-02), acc0);
	acc0 = mad(t_mz, V4(-6.514e-03, -1.627e-02, 9.897e-02, 3.289e-02), acc0);
	acc0 = mad(t_zz, V4(-8.652e-01, -1.159e-02, 8.169e-02, 4.252e-01), acc0);
	acc0 = mad(t_pz, V4(-5.410e-03, -9.159e-02, -6.371e-01, -6.387e-01), acc0);
	acc0 = mad(t_mp, V4(5.906e-05, -6.218e-03, 2.289e-02, -1.228e-01), acc0);
	acc0 = mad(t_zp, V4(2.334e-03, -4.197e-03, -1.160e-01, 3.487e-01), acc0);
	acc0 = mad(t_pp, V4(-4.110e-03, 1.439e-03, 7.400e-02, 1.250e-01), acc0);
	acc0 = max(acc0, 0.0);
	T0[px] = acc0;

	// Outputs 4..7 -> T1.
	V4 acc1 = V4(-1.023e-03, 2.540e-04, 1.793e-03, -4.770e-03);
	acc1 = mad(t_mm, V4(-1.146e-02, 2.714e-02, -1.208e-01, -5.811e-03), acc1);
	acc1 = mad(t_zm, V4(5.566e-03, -1.528e-01, 1.548e-01, -3.752e-01), acc1);
	acc1 = mad(t_pm, V4(2.526e-03, -1.193e-01, 1.655e-01, 2.262e-02), acc1);
	acc1 = mad(t_mz, V4(7.988e-01, 3.223e-02, 3.274e-02, -2.993e-01), acc1);
	acc1 = mad(t_zz, V4(-7.781e-01, 4.338e-01, 2.879e-01, 7.549e-01), acc1);
	acc1 = mad(t_pz, V4(-1.319e-02, -1.619e-01, 2.345e-01, -8.862e-02), acc1);
	acc1 = mad(t_mp, V4(-1.375e-02, 1.999e-02, 9.690e-02, 1.443e-02), acc1);
	acc1 = mad(t_zp, V4(2.869e-03, -2.522e-02, -4.490e-01, 8.423e-02), acc1);
	acc1 = mad(t_pp, V4(6.177e-03, 1.872e-02, -4.033e-01, -1.047e-01), acc1);
	acc1 = max(acc1, 0.0);
	T1[px] = acc1;

	// Outputs 8..11 -> T2.
	V4 acc2 = V4(-5.813e-04, 2.326e-02, 2.179e-02, 3.161e-03);
	acc2 = mad(t_mm, V4(2.306e-02, -1.984e-02, -1.205e-02, -5.797e-03), acc2);
	acc2 = mad(t_zm, V4(-2.113e-02, -2.094e-04, 6.522e-02, -5.227e-02), acc2);
	acc2 = mad(t_pm, V4(4.877e-03, 8.765e-03, 9.976e-03, 3.630e-02), acc2);
	acc2 = mad(t_mz, V4(2.822e-03, -4.395e-03, 3.037e-01, -2.012e-01), acc2);
	acc2 = mad(t_zz, V4(6.310e-01, -1.011e-01, -2.446e-01, 5.293e-01), acc2);
	acc2 = mad(t_pz, V4(-1.833e-02, 3.118e-02, 2.168e-02, -4.513e-02), acc2);
	acc2 = mad(t_mp, V4(2.581e-02, -1.284e-01, 3.156e-02, -6.670e-02), acc2);
	acc2 = mad(t_zp, V4(-5.647e-03, 3.777e-01, -1.097e-01, -1.901e-01), acc2);
	acc2 = mad(t_pp, V4(1.412e-02, 1.066e-02, -2.194e-02, 2.292e-02), acc2);
	acc2 = max(acc2, 0.0);
	T2[px] = acc2;
}

// Pass 2 reads T0, T1 and T2 and writes T3, T4 and T5.
//!PASS 2
//!DESC conv1 (12x12)
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN T0, T1, T2
//!OUT T3, T4, T5

// L0 / L1 / L2 (x, y): fetch T0 / T1 / T2 at offset (x, y) via O(), as a V4.
#define L0(x, y) V4(O(T0, x, y))
#define L1(x, y) V4(O(T1, x, y))
#define L2(x, y) V4(O(T2, x, y))

// Pass 2 entry point: for one pixel, reads the 3x3 neighbourhoods of T0, T1 and
// T2 (via the L0/L1/L2 macros), accumulates each V4 tap through MulAdd() with a
// constant M4 into three V4 accumulators, clamps them at zero and stores them in
// T3, T4 and T5. Unlike Pass1, the accumulators start at 0.0 and no constant
// offset is added.
//
// blockStart - added to the per-thread offset to form the pixel coordinate.
// tid        - only tid.x is used; Rmp8x8() turns it into a 2-D offset.
//
// NOTE(review): MulAdd is supplied externally (see `//!USE MulAdd`); presumably
// it returns the third argument plus the vector-matrix product of the first
// two - confirm against Magpie's definition.
void Pass2(uint2 blockStart, uint3 tid) {
	// `pt` and `pos` are referenced by name inside the O() macro.
	float2 pt = float2(GetInputPt());
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 sz = GetInputSize();
	// Skip threads whose pixel lies outside the input size.
	if (gxy.x >= sz.x || gxy.y >= sz.y)
		return;
	float2 pos = (gxy + 0.5) * pt;
	// Tap naming: s<slot>_<row>_<col>; row/col 0,1,2 correspond to y/x offsets
	// -1,0,+1 in the fetches below.
	V4 s0_0_0, s0_0_1, s0_0_2, s0_1_0, s0_1_1, s0_1_2, s0_2_0, s0_2_1, s0_2_2, s1_0_0, s1_0_1, s1_0_2, s1_1_0, s1_1_1, s1_1_2, s1_2_0, s1_2_1, s1_2_2;
	V4 r0 = 0.0, r1 = 0.0, r2 = 0.0;
	// 3x3 neighbourhood of T0 into s0_*, of T1 into s1_*.
	s0_0_0 = L0(-1.0, -1.0); s0_0_1 = L0(0.0, -1.0); s0_0_2 = L0(1.0, -1.0);
	s0_1_0 = L0(-1.0, 0.0); s0_1_1 = L0(0.0, 0.0); s0_1_2 = L0(1.0, 0.0);
	s0_2_0 = L0(-1.0, 1.0); s0_2_1 = L0(0.0, 1.0); s0_2_2 = L0(1.0, 1.0);
	s1_0_0 = L1(-1.0, -1.0); s1_0_1 = L1(0.0, -1.0); s1_0_2 = L1(1.0, -1.0);
	s1_1_0 = L1(-1.0, 0.0); s1_1_1 = L1(0.0, 0.0); s1_1_2 = L1(1.0, 0.0);
	s1_2_0 = L1(-1.0, 1.0); s1_2_1 = L1(0.0, 1.0); s1_2_2 = L1(1.0, 1.0);
	// Contributions of the T0 taps to r0, r1, r2.
	r0 = MulAdd(s0_0_0, M4(-1.195e-01, -2.782e-03, -1.647e-02, 8.935e-02, 1.342e-01, -4.409e-02, -2.694e-02, -1.557e-01, -4.474e-01, 5.639e-02, 1.080e-01, -1.304e-01, -2.758e-01, 7.983e-02, -1.059e-02, -6.616e-02), r0);
	r1 = MulAdd(s0_0_0, M4(1.783e-01, 9.807e-02, -2.627e-02, -2.885e-02, 2.384e-01, -2.243e-01, -7.838e-03, 6.787e-02, -2.319e-01, 2.666e-01, 1.008e-01, -1.147e-01, 9.865e-02, 1.294e-01, 4.350e-02, 1.246e-01), r1);
	r2 = MulAdd(s0_0_0, M4(-1.619e-02, -3.094e-02, -4.934e-02, -1.328e-02, -4.038e-02, 2.685e-02, -2.006e-01, -1.888e-02, -3.766e-02, 3.719e-02, 8.466e-02, 6.166e-03, -9.328e-02, -2.239e-01, -7.447e-02, 1.739e-02), r2);
	r0 = MulAdd(s0_0_1, M4(-5.621e-01, -1.996e-02, 8.765e-02, -3.496e-02, 4.525e-01, -7.246e-02, -2.052e-01, -1.602e-01, -2.078e-01, 6.393e-02, 2.912e-02, 7.006e-02, 4.341e-01, 1.246e-01, 2.096e-02, 6.617e-02), r0);
	r1 = MulAdd(s0_0_1, M4(2.314e-02, -3.825e-02, 2.103e-02, 4.088e-02, 3.772e-01, -2.136e-01, -9.008e-02, -1.062e-01, -9.125e-03, 7.250e-02, 2.888e-02, -1.003e-01, 1.971e-02, -3.385e-01, -5.944e-04, 2.791e-01), r1);
	r2 = MulAdd(s0_0_1, M4(8.260e-03, -1.815e-01, 1.043e-01, -6.471e-03, -2.324e-01, 3.654e-02, -3.034e-01, -4.956e-02, 6.765e-02, 9.634e-02, 6.750e-02, 2.752e-02, 2.886e-02, -1.977e-01, 6.311e-01, 2.736e-02), r2);
	r0 = MulAdd(s0_0_2, M4(-1.072e-01, -7.057e-02, 1.890e-02, 1.577e-01, 2.271e-01, 1.453e-01, -8.716e-02, -2.320e-01, -1.048e-01, 1.195e-02, -6.811e-03, 2.474e-02, -3.527e-02, -1.130e-01, -1.529e-02, -8.423e-02), r0);
	r1 = MulAdd(s0_0_2, M4(-8.031e-02, -9.505e-02, -4.529e-02, 9.090e-02, 1.433e-01, -1.259e-01, 7.499e-02, 7.982e-02, 3.114e-03, 4.358e-02, -4.961e-02, -1.052e-01, -3.700e-02, -1.449e-01, -1.125e-01, 1.505e-01), r1);
	r2 = MulAdd(s0_0_2, M4(-3.306e-02, -4.710e-03, 1.636e-01, 4.672e-02, 1.131e-01, 2.906e-02, -6.813e-02, -7.225e-03, -2.967e-02, -1.007e-01, 2.268e-02, -1.861e-02, -6.798e-02, 5.887e-02, 2.688e-02, -1.170e-05), r2);
	r0 = MulAdd(s0_1_0, M4(6.728e-02, 3.510e-02, 2.658e-02, 2.512e-01, -3.088e-01, -1.993e-01, 2.953e-01, -2.033e-01, 2.062e-01, 1.665e-02, 1.870e-01, -2.065e-01, -3.763e-01, -2.050e-01, -1.891e-02, -5.449e-01), r0);
	r1 = MulAdd(s0_1_0, M4(3.610e-01, 5.925e-01, 2.751e-02, -2.362e-02, 6.179e-01, -2.047e-01, -7.985e-02, -3.896e-01, 2.502e-02, 5.250e-01, -7.937e-03, -3.707e-01, -4.750e-01, -1.274e-01, 1.043e-01, 6.493e-01), r1);
	r2 = MulAdd(s0_1_0, M4(-8.045e-02, -4.278e-01, -3.148e-01, 1.371e-02, 4.091e-02, 7.419e-02, -1.184e-01, -8.461e-02, 8.302e-03, 3.290e-02, -1.579e-01, 1.514e-01, -3.994e-02, 1.810e-01, -2.520e-01, 7.419e-02), r2);
	r0 = MulAdd(s0_1_1, M4(3.134e-01, 3.896e-01, 7.740e-01, 7.747e-01, 3.455e-02, 1.452e-01, 2.454e-02, 5.856e-02, -8.463e-02, 2.719e-02, 3.138e-02, 1.391e-01, 1.401e-01, 2.224e-02, 5.334e-02, -1.340e-01), r0);
	r1 = MulAdd(s0_1_1, M4(2.838e-01, 5.138e-01, 3.291e-01, 3.940e-01, 1.003e-01, 1.362e-01, 1.313e-01, 9.639e-02, 2.783e-01, -5.034e-01, -6.909e-02, -3.842e-01, 1.153e-01, 2.969e-01, -4.680e-02, 8.920e-01), r1);
	r2 = MulAdd(s0_1_1, M4(3.780e-01, -4.150e-01, 5.874e-01, 4.346e-01, -5.301e-03, -6.339e-01, 1.921e-01, -1.519e-01, -5.744e-02, 2.906e-02, 4.541e-01, -1.083e-01, 3.193e-01, -1.175e-01, -1.372e-01, 3.318e-01), r2);
	r0 = MulAdd(s0_1_2, M4(1.232e-01, -3.721e-01, 9.193e-02, 1.525e-01, 5.252e-01, 2.155e-01, 1.284e-01, -2.804e-01, -1.023e-01, -2.272e-01, -2.110e-02, -6.227e-03, -1.946e-02, -9.592e-02, -5.048e-02, -2.675e-02), r0);
	r1 = MulAdd(s0_1_2, M4(1.930e-01, -8.656e-01, -2.994e-01, -2.104e-01, 1.890e-01, 4.542e-01, 2.574e-02, -1.720e-01, -1.645e-01, -3.407e-01, -5.756e-02, -1.956e-01, 1.388e-01, -1.445e-01, -6.649e-02, 2.406e-01), r1);
	r2 = MulAdd(s0_1_2, M4(-1.713e-01, -1.264e-01, 2.701e-01, 2.557e-02, 1.577e-01, 1.241e-01, 1.909e-01, 1.067e-01, -2.781e-02, 1.250e-01, 8.625e-02, -3.212e-02, -4.053e-02, -2.230e-02, 2.453e-02, -7.918e-03), r2);
	r0 = MulAdd(s0_2_0, M4(4.534e-02, -4.142e-02, 8.179e-02, -2.684e-01, -4.161e-02, -8.607e-02, -5.479e-02, 1.754e-01, -8.713e-02, -1.797e-02, 4.359e-02, -1.742e-01, -2.398e-02, 5.235e-02, 5.225e-02, 1.756e-01), r0);
	r1 = MulAdd(s0_2_0, M4(-1.164e-01, 2.328e-01, 2.310e-01, 1.101e-01, 2.458e-02, -8.798e-01, -2.802e-01, 5.107e-02, -1.197e+00, 2.584e-01, 3.634e-02, -4.719e-01, 6.120e-01, 2.479e-02, 1.159e-01, 4.846e-01), r1);
	r2 = MulAdd(s0_2_0, M4(5.291e-02, 1.612e-01, -2.273e-01, 1.439e-01, 4.775e-01, 1.082e-02, -3.568e-02, 1.408e-01, 1.625e-01, -1.131e-01, -7.561e-02, 1.519e-01, -6.958e-03, -2.370e-01, 1.773e-02, -1.732e-02), r2);
	r0 = MulAdd(s0_2_1, M4(4.541e-01, 2.469e-01, -1.168e-02, -3.170e-01, -1.101e+00, -7.643e-01, -2.874e-01, 2.968e-02, 1.191e-02, 1.177e-01, 1.100e-03, -8.066e-01, 6.307e-02, 1.023e-01, 2.403e-02, 2.938e-01), r0);
	r1 = MulAdd(s0_2_1, M4(4.083e-01, 2.414e-01, 1.567e-02, 5.885e-01, -1.929e+00, 2.075e-01, 4.123e-02, 2.607e-01, 2.411e-01, 3.928e-02, -8.402e-02, -4.086e-01, 1.313e-01, -6.640e-03, 1.242e-01, 4.443e-01), r1);
	r2 = MulAdd(s0_2_1, M4(1.621e+00, 2.404e-01, -3.576e-01, 1.405e+00, -8.027e-01, 7.398e-01, 6.259e-01, -3.249e-01, 3.969e-01, -9.999e-02, 2.323e-01, 2.189e-01, 1.142e-01, -1.526e-01, 1.631e-01, 7.761e-02), r2);
	r0 = MulAdd(s0_2_2, M4(-1.086e-02, -6.350e-02, 9.068e-03, -2.549e-01, -2.245e-01, 7.344e-01, 3.321e-02, 3.097e-01, 2.939e-02, -2.476e-01, 1.290e-02, 5.514e-02, 2.033e-02, 1.535e-02, 3.096e-02, -1.111e-01), r0);
	r1 = MulAdd(s0_2_2, M4(5.527e-01, -4.384e-01, -9.156e-02, 3.779e-01, -8.049e-02, 1.777e-01, 1.279e-01, 5.167e-01, 8.615e-02, 3.818e-02, -8.714e-02, -4.111e-01, -9.676e-03, 1.170e-01, -5.134e-02, 4.131e-01), r1);
	r2 = MulAdd(s0_2_2, M4(9.804e-02, 3.701e-01, 7.796e-02, 1.041e-01, 3.967e-01, -3.729e-02, -3.651e-01, 2.187e-01, -1.155e-01, 2.961e-02, 4.329e-02, -9.155e-02, -6.378e-02, -1.005e-01, -8.361e-04, -2.016e-02), r2);
	// Contributions of the T1 taps.
	r0 = MulAdd(s1_0_0, M4(-8.091e-02, 5.964e-03, 1.131e-02, -1.045e-01, 4.463e-01, -3.057e-02, -3.990e-02, -1.284e-01, 7.705e-02, -1.342e-01, -1.647e-01, -1.505e-01, -1.435e-02, -4.786e-02, -1.364e-02, 7.105e-02), r0);
	r1 = MulAdd(s1_0_0, M4(5.244e-01, 1.761e-01, 9.298e-02, -2.485e-02, -4.892e-01, -4.094e-01, -1.527e-01, 5.510e-02, -2.399e-01, -5.568e-01, -2.523e-01, 4.273e-02, -2.009e-02, -1.415e-01, 9.470e-03, -2.935e-02), r1);
	r2 = MulAdd(s1_0_0, M4(-1.994e-02, 4.133e-02, -7.480e-02, 1.158e-02, 5.948e-02, 1.075e-01, 4.158e-02, -7.777e-03, 7.635e-02, 1.919e-01, 2.498e-01, -6.321e-02, 3.255e-03, 5.260e-02, -1.264e-02, 1.136e-02), r2);
	r0 = MulAdd(s1_0_1, M4(-1.161e+00, -1.801e-03, 2.903e-01, 8.344e-02, 9.512e-02, 4.877e-02, 2.932e-02, -3.623e-01, -3.902e-01, -2.259e-02, 2.860e-04, -3.074e-01, -3.301e-03, -5.011e-02, -1.529e-01, 1.839e-01), r0);
	r1 = MulAdd(s1_0_1, M4(4.421e-01, 1.424e-02, 2.052e-01, 1.260e-01, -4.766e-01, 2.058e-01, -1.150e-01, 1.372e-01, -6.994e-02, 4.053e-01, 1.377e-01, -5.016e-02, 3.171e-01, -1.886e-02, 2.080e-02, -1.534e-01), r1);
	r2 = MulAdd(s1_0_1, M4(7.398e-02, -1.675e-01, 2.317e-02, 4.728e-03, 1.186e-01, 4.680e-01, 8.349e-02, 9.532e-02, -5.383e-02, 6.255e-02, -5.918e-01, -5.407e-02, -7.339e-02, 1.084e-01, -4.561e-01, -7.709e-02), r2);
	r0 = MulAdd(s1_0_2, M4(-5.955e-01, -8.572e-02, 5.788e-02, -2.494e-02, -2.346e-02, 5.889e-02, 1.739e-02, 1.601e-01, -1.611e-02, 7.011e-02, 1.486e-02, 3.629e-03, 2.366e-02, 1.401e-01, 1.545e-02, -6.272e-02), r0);
	r1 = MulAdd(s1_0_2, M4(-3.560e-01, -1.456e-01, -2.211e-01, -1.439e-01, 3.024e-01, 2.056e-01, 1.674e-01, 4.816e-02, 8.819e-02, 2.785e-01, 1.116e-01, 4.942e-02, 7.848e-02, 2.130e-01, 6.364e-02, -2.354e-01), r1);
	r2 = MulAdd(s1_0_2, M4(-6.028e-02, -2.269e-01, -2.095e-01, 8.257e-03, -3.370e-02, 1.719e-01, -1.185e-01, 2.671e-02, 1.316e-02, -6.564e-02, -5.600e-02, 2.305e-03, 2.600e-02, -2.821e-01, -1.426e-02, -7.375e-02), r2);
	r0 = MulAdd(s1_1_0, M4(-2.248e-02, 6.043e-02, 2.808e-02, 2.329e-01, 1.073e-01, -1.018e-01, -4.755e-02, -2.192e-02, 3.759e-02, -1.135e-01, -1.280e-01, 1.908e-01, 2.422e-02, -6.803e-02, 9.674e-04, 1.846e-01), r0);
	r1 = MulAdd(s1_1_0, M4(-2.803e-01, 4.876e-01, 2.270e-02, -3.722e-03, -1.658e-01, -3.292e-01, -6.100e-02, -1.270e-01, -1.044e+00, -3.930e-01, -2.939e-01, -2.094e-01, -2.124e-01, -2.370e-03, 1.091e-01, -1.311e-01), r1);
	r2 = MulAdd(s1_1_0, M4(-4.534e-03, -1.763e-01, -8.423e-02, 2.711e-02, -5.319e-03, 4.972e-01, 3.259e-02, -1.313e-01, -1.667e-01, 1.638e-01, 3.195e-01, -2.000e-01, -6.090e-02, 1.472e-01, -2.107e-02, -2.024e-02), r2);
	r0 = MulAdd(s1_1_1, M4(6.284e-01, 4.037e-01, 5.488e-01, 4.483e-01, 2.585e-01, 4.176e-02, 4.315e-02, 7.433e-01, -2.203e-01, -7.788e-02, 1.316e-03, 1.170e-01, -5.041e-01, -5.083e-01, -1.636e-01, 2.094e-01), r0);
	r1 = MulAdd(s1_1_1, M4(3.974e-01, 7.238e-01, 5.801e-01, 2.100e-01, -5.379e-01, -3.604e-02, -4.114e-01, -2.587e-01, -6.799e-01, 4.307e-01, 1.028e-01, -4.541e-01, -2.498e-01, -1.995e-01, -5.944e-03, -5.798e-01), r1);
	r2 = MulAdd(s1_1_1, M4(8.748e-01, -2.503e-01, -1.163e-01, 6.193e-01, -7.160e-02, -2.926e-01, 2.823e-01, -5.832e-02, -4.691e-01, -3.428e-01, -3.361e-01, -3.252e-01, -5.215e-01, -4.780e-01, -3.895e-01, -5.131e-01), r2);
	// NOTE(review): the -3.704e+01 entry below is the only weight in this pass
	// whose magnitude exceeds 10; it may be a genuine trained value, but confirm
	// it against the exported model.
	r0 = MulAdd(s1_1_2, M4(2.491e-01, -3.704e+01, 1.015e-01, -1.715e+00, 7.478e-04, 2.296e-01, 3.992e-02, 3.427e-01, -7.711e-03, 1.593e-02, 2.253e-02, -1.188e-01, 6.016e-02, 6.191e-01, 3.195e-03, 5.925e-02), r0);
	r1 = MulAdd(s1_1_2, M4(3.271e-01, -1.288e+00, -3.803e-01, -8.543e-02, -5.573e-01, 4.268e-01, 4.147e-01, 1.555e-01, -4.686e-02, 1.956e-01, 1.947e-02, 8.344e-02, -1.150e-01, -2.216e-02, 4.597e-02, -5.005e-02), r1);
	r2 = MulAdd(s1_1_2, M4(-5.889e-02, 7.753e-02, 9.189e-02, -7.154e-02, 4.701e-02, -3.662e-01, -2.010e-01, 3.158e-02, 6.865e-03, 5.643e-02, -6.771e-02, 3.890e-02, 3.126e-01, 1.334e-01, 1.128e-01, 1.313e-01), r2);
	r0 = MulAdd(s1_2_0, M4(-2.087e-02, 1.292e-02, 1.714e-02, -1.420e-01, 3.486e-01, -6.191e-02, -5.583e-02, -9.830e-02, 1.524e-02, -2.447e-02, -4.297e-02, -3.177e-01, 5.009e-02, 3.690e-02, 1.104e-02, 1.465e-02), r0);
	r1 = MulAdd(s1_2_0, M4(-3.224e-01, 1.184e-01, 9.703e-02, 7.334e-02, 1.129e+00, -1.284e-01, -8.011e-02, 2.494e-01, -6.895e-01, -1.548e-01, -7.984e-02, -1.451e-01, -7.972e-02, 5.407e-02, 4.626e-02, -4.652e-01), r1);
	r2 = MulAdd(s1_2_0, M4(1.332e-02, 7.325e-02, -6.152e-02, 3.228e-02, -1.453e-01, -1.606e-01, 4.684e-03, -5.536e-02, -2.218e-02, 6.068e-02, 4.577e-02, -4.796e-02, -1.470e-01, -6.950e-02, -7.434e-02, -1.102e-01), r2);
	r0 = MulAdd(s1_2_1, M4(-5.598e-02, 1.295e-01, 1.021e-03, -3.887e-01, -3.907e-01, 2.177e-01, 1.152e-02, 1.330e-01, 2.220e-02, 1.094e-02, 1.356e-02, -1.842e-01, -2.615e-02, -4.044e-02, 5.777e-03, -3.430e-01), r0);
	r1 = MulAdd(s1_2_1, M4(-1.770e+00, 6.587e-02, 4.620e-02, -4.802e-02, -1.365e-01, 4.538e-01, 1.483e-01, -3.157e-01, -1.712e-01, 4.919e-02, -2.451e-02, -1.657e-01, -5.841e-01, -2.260e-01, 3.821e-02, -6.152e-01), r1);
	r2 = MulAdd(s1_2_1, M4(2.949e-01, 1.007e-01, -2.504e-01, 2.965e-01, 1.692e-01, -4.431e-01, -3.032e-01, 8.727e-02, -7.648e-02, 5.728e-02, -1.088e-01, -5.211e-02, -2.787e-01, 2.133e-01, 3.779e-01, -1.804e-01), r2);
	r0 = MulAdd(s1_2_2, M4(-8.538e-02, -9.626e-02, -3.871e-02, -3.720e-01, -2.251e-01, -1.783e-01, -1.719e-02, -1.389e-01, -5.509e-02, 7.782e-03, -2.470e-02, -4.052e-02, 3.227e-01, -1.139e-01, -1.620e-02, 1.942e-01), r0);
	r1 = MulAdd(s1_2_2, M4(-6.348e-01, -8.457e-02, -8.323e-02, -3.917e-03, 5.205e-02, -3.622e-02, 1.685e-01, -4.062e-01, -1.141e-01, 1.716e-02, 3.226e-02, -1.704e-01, 7.662e-02, -3.979e-01, -1.098e-01, -4.929e-01), r1);
	r2 = MulAdd(s1_2_2, M4(1.495e-03, -4.435e-02, -9.707e-02, 3.295e-02, -8.101e-02, 5.559e-02, 1.508e-01, 4.317e-03, 2.925e-02, 9.780e-03, 1.053e-02, 1.849e-02, 3.227e-02, 1.697e-01, 5.290e-02, 2.406e-02), r2);
	// The s0_* variables are re-used here: from this point on they hold the
	// 3x3 neighbourhood of T2, not T0.
	s0_0_0 = L2(-1.0, -1.0); s0_0_1 = L2(0.0, -1.0); s0_0_2 = L2(1.0, -1.0);
	s0_1_0 = L2(-1.0, 0.0); s0_1_1 = L2(0.0, 0.0); s0_1_2 = L2(1.0, 0.0);
	s0_2_0 = L2(-1.0, 1.0); s0_2_1 = L2(0.0, 1.0); s0_2_2 = L2(1.0, 1.0);
	// Contributions of the T2 taps.
	r0 = MulAdd(s0_0_0, M4(-1.288e-01, -1.476e-01, -1.194e-01, 1.757e-01, -9.781e-02, 1.083e-01, 3.316e-02, -2.542e-01, 3.555e-01, 5.492e-02, -5.015e-02, 7.384e-02, 3.053e-01, 1.326e-01, -1.065e-01, 4.714e-02), r0);
	r1 = MulAdd(s0_0_0, M4(-2.339e-01, -2.341e-01, -1.656e-01, -2.802e-02, -2.977e-02, 3.076e-01, 9.083e-03, 1.685e-01, -1.178e+00, -4.871e-01, -2.761e-01, -1.368e-01, -5.744e-01, -1.064e-02, 1.435e-01, 3.032e-01), r1);
	r2 = MulAdd(s0_0_0, M4(-8.233e-02, 1.596e-01, 3.061e-01, -2.245e-02, -1.757e-01, -5.143e-02, 1.061e-01, 5.894e-02, 7.105e-02, -6.280e-02, 1.684e-01, -2.964e-02, 2.744e-02, -1.831e-01, 3.190e-02, -1.195e-02), r2);
	r0 = MulAdd(s0_0_1, M4(3.771e-01, -3.831e-01, 6.527e-02, 4.982e-01, -3.543e-01, -4.346e-01, 1.272e-01, -5.401e-02, -3.638e-01, 3.155e-01, -1.419e-02, 2.188e-01, 2.790e-01, -2.027e-01, 5.311e-02, -2.886e-01), r0);
	r1 = MulAdd(s0_0_1, M4(-4.946e-01, 1.635e-01, 1.824e-01, 1.272e-01, -1.880e-01, 3.846e-01, 1.452e-01, -9.590e-01, -1.433e+00, 3.369e-01, 3.227e-01, 1.594e-01, -5.894e-01, -3.781e-01, -2.127e-01, 2.710e-03), r1);
	r2 = MulAdd(s0_0_1, M4(-3.087e-02, 4.912e-01, -8.260e-01, -1.196e-01, -3.771e-01, -4.267e-01, -2.383e-01, -1.185e-01, 1.409e-01, 8.061e-01, -1.003e-01, -9.949e-03, -5.593e-02, 3.181e-01, -1.151e-01, -5.267e-02), r2);
	r0 = MulAdd(s0_0_2, M4(2.705e-01, 5.424e-01, -6.658e-02, 2.918e-01, -2.207e-01, 1.130e-01, -5.503e-02, -1.384e-01, 8.320e-01, 2.518e-01, -7.916e-03, 2.687e-02, -3.833e-01, -4.623e-02, 3.314e-03, -1.942e-01), r0);
	r1 = MulAdd(s0_0_2, M4(-4.029e-01, -2.731e-02, -3.362e-02, 3.361e-01, 3.759e-01, -4.710e-01, 1.406e-01, -2.484e-01, 6.383e-01, 2.517e-01, 1.400e-02, 5.596e-02, 2.565e-01, -1.129e-01, 8.728e-02, 4.135e-01), r1);
	r2 = MulAdd(s0_0_2, M4(2.970e-01, 5.334e-02, 2.566e-01, 1.008e-01, -6.914e-02, -8.490e-02, 4.448e-01, -4.482e-02, 1.091e-01, 4.105e-01, -1.081e-01, -2.144e-02, -1.531e-02, 7.835e-02, -3.865e-02, -1.018e-01), r2);
	r0 = MulAdd(s0_1_0, M4(-4.130e-01, -1.357e-01, 2.685e-02, -2.348e-01, -1.936e-02, 1.505e-02, -1.392e-01, -2.457e-02, -1.300e-02, -1.012e-01, -1.854e-01, -5.578e-01, -1.498e-01, -1.423e-01, -8.878e-03, -3.682e-01), r0);
	r1 = MulAdd(s0_1_0, M4(6.660e-01, 7.191e-02, 6.120e-01, -4.368e-01, 5.277e-01, 1.327e-01, -1.072e-01, -9.239e-02, 5.542e-01, -7.792e-01, -2.208e-01, -1.730e-01, 4.048e-01, -4.107e-01, -2.397e-01, 3.588e-01), r1);
	r2 = MulAdd(s0_1_0, M4(-4.015e-01, -5.224e-01, 4.182e-01, -4.482e-01, 1.562e-01, -4.360e-01, 1.370e-01, -3.616e-02, -4.618e-02, 1.868e-01, 8.858e-02, -8.757e-02, -1.432e-01, 1.780e-01, 2.062e-01, -1.036e-01), r2);
	r0 = MulAdd(s0_1_1, M4(7.276e-02, -1.208e-01, -5.913e-02, -5.598e-01, 2.440e-01, 2.302e-01, 1.070e-01, 2.536e-01, 7.349e-02, -3.152e-01, 2.640e-01, -2.006e-01, -5.437e-01, 1.998e-02, 2.490e-01, -1.816e-01), r0);
	r1 = MulAdd(s0_1_1, M4(7.291e-01, -7.681e-02, -1.505e-01, -7.358e-02, -2.790e-01, 3.648e-01, 1.184e-01, 1.451e+00, 2.265e-01, 3.704e-01, 2.199e-01, -7.690e-02, -2.274e+00, 6.125e-01, 2.507e-01, -2.036e-01), r1);
	r2 = MulAdd(s0_1_1, M4(-6.974e-01, -9.483e-01, -1.005e+00, -7.180e-01, 9.718e-01, 9.220e-01, 1.815e-01, 7.114e-01, -5.811e-01, -4.000e-01, 6.824e-02, -2.309e-01, 5.481e-02, 1.076e-01, -8.109e-02, 8.214e-02), r2);
	r0 = MulAdd(s0_1_2, M4(-3.908e-03, 3.818e-01, 7.491e-02, -2.036e-01, -4.699e-01, -2.655e-01, 5.943e-02, 8.274e-02, -9.263e-02, 5.145e-02, -6.481e-02, -8.418e-01, 2.354e-01, 3.803e-01, 8.590e-02, 6.857e-02), r0);
	r1 = MulAdd(s0_1_2, M4(2.391e-01, -8.973e-02, -3.659e-01, 3.401e-01, -3.881e-01, 5.709e-02, -5.577e-02, 4.037e-01, 7.414e-01, -1.485e-01, -1.317e-01, -1.475e-01, -7.282e-01, -1.551e-02, 1.411e-01, -4.009e-01), r1);
	r2 = MulAdd(s0_1_2, M4(-2.483e-01, -4.494e-01, 8.249e-01, -1.202e-01, -8.706e-02, -1.870e-01, -2.945e-01, -9.375e-02, -5.850e-02, 1.752e-01, -6.868e-02, 1.178e-01, -9.945e-02, 8.942e-02, 1.186e-01, -8.432e-02), r2);
	r0 = MulAdd(s0_2_0, M4(1.203e-02, -1.769e-01, 3.631e-02, 1.997e-01, 4.286e-02, -7.483e-02, -6.190e-02, -9.006e-02, -7.242e-02, 5.012e-03, -1.616e-02, 3.364e-01, 9.702e-02, 2.644e-02, 4.662e-02, 1.905e-01), r0);
	r1 = MulAdd(s0_2_0, M4(1.757e-01, -2.754e-02, -1.902e-01, 7.075e-02, -2.360e-01, -6.136e-02, -1.723e-01, -4.440e-01, 7.379e-01, -9.306e-02, -1.066e-01, 2.348e-01, 3.588e-01, 2.486e-01, 7.216e-03, 8.569e-02), r1);
	r2 = MulAdd(s0_2_0, M4(2.279e-01, 6.121e-01, 3.666e-01, 2.546e-01, 1.741e-02, 1.978e-01, 8.353e-02, -2.692e-03, 1.146e-02, -2.822e-01, 1.836e-04, -1.034e-02, 6.014e-03, 1.530e-01, 9.490e-02, 9.806e-02), r2);
	r0 = MulAdd(s0_2_1, M4(2.685e-01, -3.993e-02, 1.243e-01, 1.526e-01, -2.746e-02, 5.692e-03, -4.628e-02, -1.644e-01, -6.031e-02, 1.331e-01, 4.747e-02, 6.986e-01, -5.089e-02, -8.952e-02, -8.912e-03, 6.074e-01), r0);
	r1 = MulAdd(s0_2_1, M4(2.769e-02, 4.781e-02, -2.540e-02, 9.864e-02, -6.560e-01, -4.386e-02, -1.233e-01, -5.068e-01, -7.804e-01, 7.293e-02, 1.295e-01, 9.702e-02, 1.280e+00, 1.412e-01, 1.408e-02, -9.061e-02), r1);
	r2 = MulAdd(s0_2_1, M4(6.217e-01, 5.693e-01, -4.500e-01, 8.445e-01, -6.107e-02, -2.220e-01, -2.737e-01, -1.397e-01, 2.242e-01, -1.369e-01, -5.505e-02, 6.678e-02, 1.838e-01, -7.097e-02, 1.136e-01, 1.761e-01), r2);
	r0 = MulAdd(s0_2_2, M4(-2.326e-01, 1.049e-01, -6.058e-02, -2.120e-01, -5.192e-02, -5.077e-02, -7.789e-03, 3.571e-02, -1.458e-01, -8.850e-02, 7.475e-02, 4.803e-01, 1.460e-01, -2.452e-02, 1.272e-02, 7.251e-02), r0);
	r1 = MulAdd(s0_2_2, M4(-2.952e-01, 1.587e-01, 1.255e-01, -3.417e-01, -1.724e-01, -4.613e-02, 6.487e-02, -4.664e-01, 3.921e-01, -5.472e-02, 2.621e-02, 5.369e-01, 3.998e-01, -1.093e-01, -1.931e-02, 3.767e-02), r1);
	r2 = MulAdd(s0_2_2, M4(2.300e-01, -6.538e-03, 1.104e-01, 2.158e-01, -1.142e-01, -4.974e-02, -6.693e-02, -8.683e-02, -9.128e-02, -5.455e-01, -8.184e-02, -3.169e-02, -7.272e-02, 2.355e-02, -2.067e-01, -4.727e-02), r2);
	// Clamp each accumulator at zero and store it at this pixel.
	r0 = max(r0, 0.0);
	T3[gxy] = r0;
	r1 = max(r1, 0.0);
	T4[gxy] = r1;
	r2 = max(r2, 0.0);
	T5[gxy] = r2;
}

// Pass 3 reads T3, T4 and T5 and lists T0 and T1 as its outputs.
//!PASS 3
//!DESC conv2 (12x8)
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN T3, T4, T5
//!OUT T0, T1

// L0 / L1 / L2 (x, y): fetch T3 / T4 / T5 at offset (x, y) via O(), as a V4.
#define L0(x, y) V4(O(T3, x, y))
#define L1(x, y) V4(O(T4, x, y))
#define L2(x, y) V4(O(T5, x, y))

void Pass3(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 sz = GetInputSize();
	if (gxy.x >= sz.x || gxy.y >= sz.y)
		return;
	float2 pos = (gxy + 0.5) * pt;
	V4 s0_0_0, s0_0_1, s0_0_2, s0_1_0, s0_1_1, s0_1_2, s0_2_0, s0_2_1, s0_2_2, s1_0_0, s1_0_1, s1_0_2, s1_1_0, s1_1_1, s1_1_2, s1_2_0, s1_2_1, s1_2_2;
	V4 r0 = 0.0, r1 = 0.0;
	s0_0_0 = L0(-1.0, -1.0); s0_0_1 = L0(0.0, -1.0); s0_0_2 = L0(1.0, -1.0);
	s0_1_0 = L0(-1.0, 0.0); s0_1_1 = L0(0.0, 0.0); s0_1_2 = L0(1.0, 0.0);
	s0_2_0 = L0(-1.0, 1.0); s0_2_1 = L0(0.0, 1.0); s0_2_2 = L0(1.0, 1.0);
	s1_0_0 = L1(-1.0, -1.0); s1_0_1 = L1(0.0, -1.0); s1_0_2 = L1(1.0, -1.0);
	s1_1_0 = L1(-1.0, 0.0); s1_1_1 = L1(0.0, 0.0); s1_1_2 = L1(1.0, 0.0);
	s1_2_0 = L1(-1.0, 1.0); s1_2_1 = L1(0.0, 1.0); s1_2_2 = L1(1.0, 1.0);
	r0 = MulAdd(s0_0_0, M4(2.387e-02, 5.039e-03, 1.100e-02, -1.362e-02, 9.302e-02, 8.205e-02, 2.296e-02, 7.425e-03, -2.191e-02, -1.911e-02, 2.297e-04, 9.503e-04, -2.780e-02, 2.108e-02, 1.064e-02, -1.065e-02), r0);
	r1 = MulAdd(s0_0_0, M4(2.802e-02, -9.735e-03, -7.902e-03, -3.066e-02, -1.309e-02, 6.763e-02, 2.354e-03, -1.702e-02, -5.457e-02, -1.194e-01, 2.090e-02, 2.264e-02, 3.222e-02, -3.080e-02, 1.552e-02, 1.208e-01), r1);
	r0 = MulAdd(s0_0_1, M4(1.401e-02, -4.661e-03, 6.371e-02, -1.231e-02, 1.398e-01, 5.913e-02, -1.489e-01, 2.703e-02, 2.063e-03, -4.034e-02, 5.600e-03, -1.538e-01, -2.203e-02, 2.497e-02, -1.312e-01, 6.019e-02), r0);
	r1 = MulAdd(s0_0_1, M4(5.172e-03, 6.714e-02, 7.182e-02, 1.414e-01, 9.651e-02, 5.924e-02, 5.372e-03, 3.350e-01, 4.913e-02, 1.126e-01, -8.533e-02, -5.515e-02, -4.498e-02, -4.091e-02, -1.401e-01, -9.632e-02), r1);
	r0 = MulAdd(s0_0_2, M4(4.700e-02, 3.259e-02, 5.663e-02, 3.233e-02, 3.070e-02, 2.611e-02, -1.845e-02, 1.003e-02, -3.212e-03, -8.722e-03, 2.501e-03, 3.606e-02, -3.405e-02, -1.799e-02, -6.237e-02, 9.660e-03), r0);
	r1 = MulAdd(s0_0_2, M4(5.018e-02, 8.628e-03, 1.489e-01, -8.424e-02, 1.169e-01, -4.234e-03, 5.637e-02, -7.539e-02, -9.058e-02, -1.327e-02, 8.276e-02, 1.966e-02, 3.427e-02, -1.187e-02, -1.087e-01, -8.417e-02), r1);
	r0 = MulAdd(s0_1_0, M4(-1.191e-01, 1.294e-01, 1.445e-02, 2.066e-02, -4.614e-02, -2.934e-02, -3.672e-02, -1.284e-02, 5.660e-01, 2.852e-01, 2.228e-01, 1.082e-01, 2.247e-02, -7.533e-03, -2.905e-02, -6.454e-04), r0);
	r1 = MulAdd(s0_1_0, M4(8.417e-02, -2.250e-01, -1.563e-02, -2.469e-02, 1.544e-01, 2.183e-01, -1.818e-03, -6.337e-03, -2.744e-01, 9.829e-01, 2.247e-02, 4.035e-02, 1.008e-01, 1.393e-01, 4.202e-03, 1.659e-02), r1);
	r0 = MulAdd(s0_1_1, M4(-6.695e-02, 1.528e-01, 5.858e-02, 6.154e-01, 5.879e-01, 6.793e-01, 8.536e-01, 4.064e-01, 3.909e-01, 4.033e-01, -7.441e-01, 6.816e-01, 2.666e-01, -2.377e-01, 1.375e-02, -9.337e-02), r0);
	r1 = MulAdd(s0_1_1, M4(-7.669e-02, 6.810e-02, -1.178e+00, 8.876e-02, 4.678e-01, 8.091e-02, 2.549e-01, 4.048e-01, 6.447e-01, -3.797e-01, 6.796e-01, -1.002e-01, -1.359e-01, 1.556e-01, 4.443e-01, 5.872e-02), r1);
	r0 = MulAdd(s0_1_2, M4(2.936e-02, 3.309e-02, -4.649e-02, 1.266e-03, -1.071e-02, 1.150e-02, -4.307e-02, 8.376e-02, -8.812e-02, 2.457e-03, 3.471e-01, 6.348e-02, -9.286e-03, -1.159e-02, -1.309e-01, -2.829e-02), r0);
	r1 = MulAdd(s0_1_2, M4(8.603e-02, -1.563e-02, -4.872e-02, -8.130e-02, 1.372e-01, 6.076e-04, 4.155e-02, 4.680e-02, -3.156e-01, 3.210e-02, -3.230e-01, 1.127e-01, -2.228e-02, 1.770e-02, 3.213e-01, -1.441e-01), r1);
	r0 = MulAdd(s0_2_0, M4(-1.035e-01, -1.453e-01, 2.119e-02, -4.831e-02, 2.838e-02, 3.192e-02, -2.990e-03, 1.083e-02, -4.665e-02, 2.126e-01, 2.998e-01, 5.725e-02, -2.358e-02, 2.070e-02, 9.487e-03, 2.215e-02), r0);
	r1 = MulAdd(s0_2_0, M4(2.265e-01, 5.212e-02, -2.732e-02, -2.056e-01, -1.269e-01, -9.936e-02, 8.049e-04, -2.832e-02, -3.935e-01, -6.899e-02, -8.451e-03, -8.612e-02, 4.968e-02, -7.789e-02, -1.894e-02, -9.563e-03), r1);
	r0 = MulAdd(s0_2_1, M4(-4.269e-02, -2.241e-01, -4.941e-01, -1.515e-01, 1.681e-02, 8.103e-02, 1.392e-02, -2.671e-02, -9.747e-02, -3.161e-02, -6.227e-03, 1.997e-01, -3.018e-01, 1.713e-01, 6.287e-02, 1.076e-01), r0);
	r1 = MulAdd(s0_2_1, M4(-4.755e-01, -1.019e-01, 5.387e-02, 1.452e-01, 1.255e-01, 6.241e-04, -9.066e-02, 4.872e-02, 3.606e-01, -4.531e-02, 3.632e-02, -2.959e-01, 1.296e-02, -2.184e-02, 4.305e-04, -2.642e-02), r1);
	r0 = MulAdd(s0_2_2, M4(-1.507e-02, 9.670e-03, -7.726e-02, -7.397e-02, 1.959e-02, 1.723e-02, -9.752e-03, -2.326e-02, -2.236e-02, -6.484e-02, -4.524e-02, -5.987e-02, 2.390e-02, 6.645e-02, 5.563e-02, 1.685e-02), r0);
	r1 = MulAdd(s0_2_2, M4(1.048e-01, 2.912e-02, -1.161e-01, 1.022e-01, 5.820e-02, -8.682e-04, 2.609e-02, -6.270e-03, -2.588e-01, -1.494e-02, -4.232e-02, -3.747e-02, -9.839e-02, -8.867e-03, 7.103e-02, 6.650e-02), r1);
	r0 = MulAdd(s1_0_0, M4(-1.487e-01, -3.279e-01, -2.672e-02, -8.456e-02, 7.692e-03, -1.460e-02, 3.830e-04, -1.309e-02, -1.023e-01, -2.793e-02, -3.870e-02, -1.387e-03, 2.262e-01, 6.213e-02, 8.477e-02, 5.583e-02), r0);
	r1 = MulAdd(s1_0_0, M4(1.209e-01, -5.293e-01, 1.742e-02, 1.232e-01, -2.523e-02, 3.848e-02, 1.010e-02, -4.849e-02, 3.161e-02, -4.788e-02, -3.296e-02, 6.662e-02, -1.255e-01, 2.300e-01, -3.164e-02, 1.294e-01), r1);
	r0 = MulAdd(s1_0_1, M4(-1.350e-01, 1.637e-01, -1.179e-01, -1.987e-01, 5.722e-02, 7.147e-03, 3.106e-02, 2.949e-02, -1.528e-01, -8.862e-02, -1.209e-01, -1.719e-02, -1.948e-01, -1.005e-01, 1.262e-03, -3.722e-02), r0);
	r1 = MulAdd(s1_0_1, M4(-2.300e-01, -1.910e-01, -1.522e-01, -1.348e+00, -4.016e-02, 2.514e-02, -8.788e-03, -2.277e-01, 5.975e-02, -7.251e-02, -2.105e-01, 2.540e-01, 1.489e-01, -2.735e-01, 2.353e-01, -3.681e-01), r1);
	r0 = MulAdd(s1_0_2, M4(-1.500e-01, -1.010e-01, -1.774e-01, -1.191e-01, -2.646e-02, -2.101e-02, 2.656e-03, -1.485e-02, -3.013e-02, 3.431e-02, 3.875e-02, -3.613e-02, -5.432e-02, -2.659e-02, -6.672e-02, -3.826e-02), r0);
	r1 = MulAdd(s1_0_2, M4(-2.183e-01, -2.350e-03, -2.978e-01, -1.457e-01, -7.195e-03, -1.510e-02, 5.454e-02, -2.988e-02, 1.451e-01, 3.372e-03, 2.713e-03, -1.074e-01, -1.362e-01, 2.364e-02, -2.819e-01, 1.518e-01), r1);
	r0 = MulAdd(s1_1_0, M4(1.042e-01, 2.220e-01, 5.613e-02, 1.645e-01, -1.062e-01, -3.348e-02, 5.082e-02, -1.945e-02, -1.909e-01, -1.220e-01, -1.552e-01, -7.162e-02, -1.783e-01, 6.354e-02, 4.846e-02, 2.436e-02), r0);
	r1 = MulAdd(s1_1_0, M4(-1.876e-01, 2.569e-01, -8.313e-02, -8.131e-02, -3.810e-02, -4.132e-01, -2.325e-02, 3.355e-02, 2.210e-01, -3.486e-01, 4.346e-03, 1.124e-01, -1.194e-01, -2.803e-01, 3.860e-02, -1.494e-03), r1);
	r0 = MulAdd(s1_1_1, M4(2.000e-01, -6.499e-01, 2.163e-01, -1.899e-01, -2.588e-01, -1.604e-01, -8.083e-02, -2.531e-01, 2.904e-01, 3.430e-01, 6.142e-02, -3.122e-01, -1.360e-01, -2.939e-01, -3.565e-01, -5.947e-02), r0);
	r1 = MulAdd(s1_1_1, M4(6.801e-02, 1.179e-01, -5.623e-02, -6.493e-02, -4.985e-01, -1.634e-02, -3.174e-01, -4.263e-01, -1.463e-01, 1.955e-01, -2.140e-01, 8.439e-01, 3.290e-02, 4.419e-02, -2.152e-01, -1.529e-01), r1);
	r0 = MulAdd(s1_1_2, M4(-1.757e-02, -2.921e-01, -1.383e-02, 8.974e-02, -1.238e-02, -1.996e-02, -7.740e-02, -2.862e-02, -1.169e-01, -1.899e-01, 2.023e-01, 2.529e-01, 3.576e-02, -1.853e-02, -5.013e-03, -9.529e-02), r0);
	r1 = MulAdd(s1_1_2, M4(-2.812e-01, 8.243e-03, 2.446e-01, -4.952e-02, -1.401e-01, -5.836e-03, -3.480e-02, -3.135e-02, 6.191e-01, -3.551e-02, 2.819e-02, -1.803e-01, -2.387e-01, 1.252e-02, 1.372e-01, -6.929e-02), r1);
	r0 = MulAdd(s1_2_0, M4(-6.778e-03, -8.322e-02, -6.105e-03, -2.485e-02, -1.710e-02, -8.667e-02, -9.260e-03, -2.666e-03, -1.631e-02, -1.091e-01, -1.499e-01, -1.142e-03, 1.879e-02, -8.178e-02, 2.972e-03, -4.369e-02), r0);
	r1 = MulAdd(s1_2_0, M4(3.702e-02, -1.485e-01, -5.488e-03, 5.798e-02, 1.759e-02, 2.316e-02, 2.227e-02, -3.748e-02, 1.924e-01, -2.009e-02, 1.210e-02, 6.553e-02, 8.576e-02, 5.872e-02, -1.460e-02, -3.207e-02), r1);
	r0 = MulAdd(s1_2_1, M4(-2.267e-01, 7.790e-02, -1.837e-02, -7.647e-02, 3.443e-03, -6.516e-02, -1.018e-01, -2.413e-02, -7.503e-02, -9.552e-02, -4.063e-02, -6.569e-02, 2.606e-02, 5.670e-02, 7.544e-02, -2.148e-02), r0);
	r1 = MulAdd(s1_2_1, M4(1.096e-02, -7.206e-02, -7.949e-02, 4.133e-02, -1.003e-01, -2.908e-02, 5.395e-03, -3.755e-02, 5.544e-03, 3.053e-02, -2.249e-02, 1.112e-01, -6.042e-02, 1.486e-02, 3.790e-02, 5.070e-02), r1);
	r0 = MulAdd(s1_2_2, M4(-5.846e-02, 9.742e-02, -7.688e-02, 1.446e-01, -2.631e-02, -2.632e-02, -8.949e-03, -1.008e-02, -5.947e-03, -3.663e-03, -3.306e-03, -3.730e-02, -1.279e-02, 1.553e-03, 3.512e-03, 3.078e-02), r0);
	r1 = MulAdd(s1_2_2, M4(-4.995e-02, -3.162e-02, -1.268e-01, -2.651e-02, -2.819e-02, 2.515e-03, -3.405e-02, 1.166e-02, 2.245e-01, 2.107e-02, 1.057e-01, 9.793e-03, -4.176e-02, -1.143e-03, 1.737e-03, -2.745e-02), r1);
	s0_0_0 = L2(-1.0, -1.0); s0_0_1 = L2(0.0, -1.0); s0_0_2 = L2(1.0, -1.0);
	s0_1_0 = L2(-1.0, 0.0); s0_1_1 = L2(0.0, 0.0); s0_1_2 = L2(1.0, 0.0);
	s0_2_0 = L2(-1.0, 1.0); s0_2_1 = L2(0.0, 1.0); s0_2_2 = L2(1.0, 1.0);
	r0 = MulAdd(s0_0_0, M4(-1.196e-01, -4.586e-02, -1.041e-02, -7.075e-02, -1.043e-01, -1.993e-01, 4.699e-03, 2.184e-02, -5.846e-02, -1.485e-02, 3.524e-02, 1.643e-02, 9.159e-02, 1.665e-03, -1.411e-01, 3.330e-02), r0);
	r1 = MulAdd(s0_0_0, M4(2.904e-01, -9.613e-02, -1.256e-03, -7.892e-02, -1.265e-02, 2.645e-03, -1.015e-01, -2.250e-01, -2.015e-02, -1.767e-01, -1.145e-01, 1.611e-01, -1.394e-01, 9.167e-02, -1.368e-02, -2.010e-01), r1);
	r0 = MulAdd(s0_0_1, M4(-7.302e-03, -1.195e-01, 1.192e-01, -2.582e-01, 2.065e-01, -4.437e-01, 1.013e-01, -1.743e-01, -7.795e-03, -7.220e-03, -4.291e-02, -9.929e-02, 1.154e-02, 9.751e-03, 4.229e-01, 2.901e-01), r0);
	r1 = MulAdd(s0_0_1, M4(-3.311e-01, 3.008e-02, -1.935e-02, -1.906e-01, -2.607e-01, 1.470e-01, 2.939e-01, -1.605e-01, -2.637e-02, 4.003e-03, -2.290e-01, -1.696e-01, 9.910e-02, -2.000e-02, 4.756e-01, 5.140e-01), r1);
	r0 = MulAdd(s0_0_2, M4(1.236e-02, 1.774e-02, 1.694e-01, 1.014e-01, -3.561e-02, -3.528e-02, -1.364e-02, 1.269e-01, -1.765e-02, -2.019e-02, -9.837e-03, -1.400e-02, -5.214e-03, -2.716e-02, -2.091e-01, -1.499e-01), r0);
	r1 = MulAdd(s0_0_2, M4(-2.367e-02, 2.824e-02, 2.510e-02, 2.237e-01, -1.304e-01, 5.437e-02, -4.635e-02, -5.073e-02, -5.724e-04, -4.104e-03, -1.058e-02, 4.223e-04, -1.335e-01, -4.954e-02, 8.899e-02, -3.740e-01), r1);
	r0 = MulAdd(s0_1_0, M4(-3.056e-01, -4.099e-01, -1.091e-01, -1.174e-01, -1.373e-01, -4.709e-02, -1.483e-02, -1.188e-02, -1.693e-02, 5.456e-02, 3.565e-02, 3.283e-02, 4.658e-01, 4.776e-01, -3.195e-02, 1.139e-01), r0);
	r1 = MulAdd(s0_1_0, M4(5.813e-01, -6.348e-01, 5.341e-02, 1.283e-01, -2.898e-01, -5.605e-01, -7.739e-02, 4.109e-02, -4.842e-01, -2.015e-01, 3.188e-02, 2.494e-01, -4.639e-01, 9.178e-01, -7.965e-02, -1.977e-01), r1);
	r0 = MulAdd(s0_1_1, M4(-4.475e-01, -2.325e-01, -2.114e-01, -3.678e-01, -4.110e-01, -5.558e-02, -4.194e-01, -1.646e-01, -1.812e-01, -1.165e-01, -2.180e-01, -4.219e-01, 6.516e-01, 4.901e-01, 7.598e-01, 2.872e-01), r0);
	r1 = MulAdd(s0_1_1, M4(-6.779e-01, -2.329e-01, -3.032e-01, -9.753e-01, 2.636e-02, 1.457e-02, -1.628e-01, -2.202e-01, 2.485e-01, -1.431e-01, -3.447e-01, -7.063e-01, 8.067e-01, 3.149e-01, 3.533e-01, 1.168e+00), r1);
	r0 = MulAdd(s0_1_2, M4(-8.960e-02, -1.548e-01, -2.743e-02, -1.452e-01, -3.411e-02, -7.984e-02, -1.720e-01, -1.372e-01, -2.922e-02, -4.869e-02, 6.113e-02, -6.427e-02, 1.180e-01, 1.630e-01, 1.424e-01, 2.843e-01), r0);
	r1 = MulAdd(s0_1_2, M4(-6.987e-01, 2.133e-03, -4.074e-02, 9.316e-02, -1.117e-01, -2.403e-02, 3.816e-02, -1.217e-02, -1.752e-01, -1.545e-03, -1.333e-01, -6.443e-02, 5.456e-01, 7.471e-03, -7.507e-03, -2.339e-01), r1);
	r0 = MulAdd(s0_2_0, M4(5.189e-02, -3.084e-02, 6.678e-02, -1.339e-02, 5.211e-02, -4.927e-02, -1.239e-02, -1.325e-02, -1.362e-02, -5.128e-02, 1.129e-02, 4.093e-02, -4.678e-02, 9.366e-02, -6.714e-02, 1.592e-02), r0);
	r1 = MulAdd(s0_2_0, M4(1.821e-01, 1.900e-01, -4.043e-02, -2.715e-02, 7.738e-02, 1.067e-01, -2.444e-02, 3.594e-02, -8.610e-02, -1.304e-01, 1.970e-02, 4.296e-03, -2.328e-01, -2.435e-01, 3.431e-02, 5.506e-02), r1);
	r0 = MulAdd(s0_2_1, M4(1.392e-01, -4.945e-02, 2.119e-01, -7.351e-02, 4.283e-02, 2.137e-02, 1.687e-01, -1.623e-02, 1.400e-02, -7.672e-03, -1.617e-01, -4.320e-02, -1.651e-01, 2.251e-02, -2.378e-01, 7.097e-02), r0);
	r1 = MulAdd(s0_2_1, M4(-2.090e-01, 3.536e-02, -3.582e-03, 2.963e-03, -6.411e-02, -9.594e-03, 1.891e-02, 3.416e-02, 1.578e-02, -2.630e-02, 8.782e-04, 2.474e-02, 2.531e-01, -2.681e-02, -2.825e-02, -3.559e-02), r1);
	r0 = MulAdd(s0_2_2, M4(6.498e-02, 6.811e-02, 1.636e-01, 5.577e-02, -1.441e-02, 7.929e-03, 3.934e-02, 3.743e-04, -2.435e-02, -6.520e-03, 1.730e-02, 8.862e-03, -6.469e-02, -7.798e-02, -2.087e-01, -6.567e-02), r0);
	r1 = MulAdd(s0_2_2, M4(-4.256e-02, 2.814e-02, -8.059e-02, 4.228e-02, 4.183e-02, 3.332e-03, -8.306e-02, -5.689e-02, -2.430e-02, -6.836e-03, -2.929e-02, 1.817e-02, 3.010e-02, -3.360e-02, 1.072e-01, -2.368e-02), r1);
	r0 = max(r0, 0.0);
	T0[gxy] = r0;
	r1 = max(r1, 0.0);
	T1[gxy] = r1;
}

//!PASS 4
//!DESC out-shuffle (8x4)
//!BLOCK_SIZE 16
//!NUM_THREADS 64
//!IN INPUT, T0, T1
//!OUT OUTPUT

// Per-pass sampling helpers: fetch feature texture T0 / T1 at an offset of
// (x, y) texels from `pos` through the O() macro (point sampler SP, mip 0)
// and convert the result to V4.
#define L0(x, y) V4(O(T0, x, y))
#define L1(x, y) V4(O(T1, x, y))

// Final pass: combines the 3x3 neighbourhoods of T0 and T1 into one 4-component
// value and writes a 2x2 block of OUTPUT pixels per thread.
//
// blockStart - origin of this thread group's block, in OUTPUT pixel coordinates
//              (it is added directly to the output-space thread offset below).
// tid        - thread id; only tid.x is used, remapped through Rmp8x8().
//
// The four components of the accumulated value r0 are added to the first
// (luma) channel of the four output pixels; the remaining two channels come
// unchanged from a linearly filtered sample of INPUT.
void Pass4(uint2 blockStart, uint3 tid) {
	// NOTE(review): pt is presumably the size of one input texel in UV space;
	// GetInputPt() is defined outside this file - confirm.
	float2 pt = float2(GetInputPt());
	// Each thread owns a 2x2 output quad, hence the shift by one.
	uint2 gxy = (Rmp8x8(tid.x) << 1) + blockStart;
	uint2 sz = GetOutputSize();
	// Skip quads whose top-left pixel lies outside OUTPUT.
	if (gxy.x >= sz.x || gxy.y >= sz.y)
		return;
	// Centre of the source texel (gxy / 2) that this output quad maps to.
	float2 pos = ((gxy >> 1) + 0.5) * pt;
	// sT_R_C: sample of texture T<T> at row offset R-1 and column offset C-1.
	V4 s0_0_0, s0_0_1, s0_0_2, s0_1_0, s0_1_1, s0_1_2, s0_2_0, s0_2_1, s0_2_2, s1_0_0, s1_0_1, s1_0_2, s1_1_0, s1_1_1, s1_1_2, s1_2_0, s1_2_1, s1_2_2;
	V4 r0 = 0.0;
	// Load the 3x3 neighbourhood of T0 ...
	s0_0_0 = L0(-1.0, -1.0); s0_0_1 = L0(0.0, -1.0); s0_0_2 = L0(1.0, -1.0);
	s0_1_0 = L0(-1.0, 0.0); s0_1_1 = L0(0.0, 0.0); s0_1_2 = L0(1.0, 0.0);
	s0_2_0 = L0(-1.0, 1.0); s0_2_1 = L0(0.0, 1.0); s0_2_2 = L0(1.0, 1.0);
	// ... and of T1.
	s1_0_0 = L1(-1.0, -1.0); s1_0_1 = L1(0.0, -1.0); s1_0_2 = L1(1.0, -1.0);
	s1_1_0 = L1(-1.0, 0.0); s1_1_1 = L1(0.0, 0.0); s1_1_2 = L1(1.0, 0.0);
	s1_2_0 = L1(-1.0, 1.0); s1_2_1 = L1(0.0, 1.0); s1_2_2 = L1(1.0, 1.0);
	// Accumulate one 4x4 weight matrix per tap into r0.
	// NOTE(review): MulAdd comes from the `//!USE MulAdd` directive; presumably
	// it computes mul(sample, matrix) + accumulator - confirm against Magpie.
	// The weights are generated constants; keep the statement order as-is so the
	// floating-point accumulation order does not change.
	r0 = MulAdd(s0_0_0, M4(2.292e-02, 1.001e-02, -8.934e-03, 1.739e-03, -1.075e-02, 1.342e-03, 7.415e-03, 4.035e-03, 6.036e-02, -7.155e-03, -1.980e-03, -2.166e-03, -2.081e-02, 7.952e-03, -1.267e-02, 1.142e-02), r0);
	r0 = MulAdd(s0_0_1, M4(7.132e-02, 2.416e-02, -2.227e-03, -4.860e-04, 5.798e-03, -1.587e-02, 2.928e-02, -4.944e-02, 4.748e-02, 1.255e-01, -1.654e-02, 7.912e-03, -7.690e-02, -8.911e-02, -5.182e-04, 1.592e-02), r0);
	r0 = MulAdd(s0_0_2, M4(1.837e-02, 1.590e-02, -5.851e-04, -1.591e-02, 2.044e-03, -3.361e-02, 1.057e-02, 1.349e-02, 1.272e-02, -6.750e-03, 7.907e-03, -1.285e-02, -1.592e-02, 9.610e-04, -9.432e-03, 8.813e-03), r0);
	r0 = MulAdd(s0_1_0, M4(8.667e-02, -9.242e-03, 3.318e-02, 1.916e-02, 6.151e-02, 2.082e-02, 3.235e-02, -2.143e-02, 5.009e-02, -1.719e-02, 1.851e-01, -1.685e-02, 2.506e-02, -3.894e-03, 1.690e-03, 2.377e-02), r0);
	r0 = MulAdd(s0_1_1, M4(-8.387e-01, -1.761e-01, 3.328e-01, 5.308e-02, 2.998e-01, 4.975e-02, -7.793e-01, -1.165e-01, -5.922e-02, 1.125e-01, 9.888e-02, 4.112e-01, 2.311e-01, 3.285e-01, 1.176e-02, -2.353e-01), r0);
	r0 = MulAdd(s0_1_2, M4(-3.505e-02, -1.204e-01, -3.650e-02, 3.279e-02, 2.166e-02, 1.921e-02, 4.179e-02, -1.014e-01, 1.004e-02, -3.576e-02, 1.460e-02, -1.919e-02, -8.401e-03, 5.749e-02, -1.101e-02, 4.334e-02), r0);
	r0 = MulAdd(s0_2_0, M4(1.645e-02, 5.136e-04, 1.847e-02, 1.850e-02, -5.264e-03, 1.014e-03, 1.631e-02, 7.070e-03, -1.447e-03, -3.175e-03, -3.604e-02, -1.518e-02, 1.477e-05, -1.200e-03, 5.377e-02, -3.313e-03), r0);
	r0 = MulAdd(s0_2_1, M4(3.565e-02, 5.328e-03, 4.944e-02, -7.376e-03, -2.322e-03, 6.952e-04, 2.526e-02, -4.164e-03, -5.473e-03, 3.168e-03, -8.765e-02, -6.798e-02, -4.066e-03, -7.083e-03, 1.200e-01, 1.899e-01), r0);
	r0 = MulAdd(s0_2_2, M4(1.376e-02, 5.270e-03, 2.451e-02, -3.066e-02, -1.031e-03, -8.212e-04, -1.497e-02, 1.617e-04, -6.968e-03, 2.039e-04, -6.910e-03, -1.776e-02, 2.744e-03, 5.319e-04, 8.148e-03, 1.788e-02), r0);
	r0 = MulAdd(s1_0_0, M4(-1.572e-02, 8.821e-03, -1.483e-02, 2.154e-02, -9.811e-03, -2.851e-03, 2.289e-03, -3.321e-03, -4.134e-02, -7.724e-03, 4.346e-03, -9.430e-04, -4.933e-03, 3.370e-03, -5.796e-03, 8.162e-03), r0);
	r0 = MulAdd(s1_0_1, M4(5.311e-02, -1.421e-01, 1.289e-02, 3.699e-02, 2.606e-02, -7.378e-03, 5.391e-03, -6.596e-03, -3.749e-02, -7.154e-02, -1.086e-03, 6.008e-03, 2.401e-02, -1.457e-02, -2.020e-02, 6.355e-03), r0);
	r0 = MulAdd(s1_0_2, M4(-2.142e-02, 4.039e-02, -1.041e-02, 9.822e-03, 3.846e-02, 1.240e-01, 3.890e-03, -1.742e-03, -2.683e-03, 7.338e-03, -4.240e-04, 4.985e-03, -6.563e-03, 2.869e-03, 4.070e-03, 4.858e-03), r0);
	r0 = MulAdd(s1_1_0, M4(9.836e-02, 2.372e-03, 1.294e-01, -3.218e-03, -6.737e-02, -7.906e-03, -1.745e-02, -6.910e-03, 1.958e-01, 3.296e-02, -1.284e-01, -3.370e-02, -1.589e-02, -3.011e-02, -3.112e-03, -1.329e-02), r0);
	r0 = MulAdd(s1_1_1, M4(3.337e-01, -3.467e-01, 3.918e-01, -5.527e-01, -1.231e-01, 6.421e-02, 1.877e-02, 4.993e-02, 1.997e-01, 3.447e-01, -8.720e-02, -1.753e-01, -5.701e-01, 4.404e-01, -9.540e-02, 1.248e-01), r0);
	r0 = MulAdd(s1_1_2, M4(-1.848e-02, 7.104e-02, -3.674e-02, 7.264e-02, 3.116e-02, -8.314e-01, 1.517e-01, 3.116e-01, 2.196e-03, 3.479e-02, 3.625e-03, -7.317e-03, -4.041e-02, 9.449e-02, -3.724e-02, 1.484e-02), r0);
	r0 = MulAdd(s1_2_0, M4(-2.451e-02, 4.403e-03, -5.063e-03, 4.535e-03, -1.594e-02, -2.546e-03, -3.374e-02, -1.276e-02, -5.974e-02, -8.156e-03, -8.276e-02, 6.096e-02, -9.513e-03, 9.127e-04, -8.361e-03, -2.912e-02), r0);
	r0 = MulAdd(s1_2_1, M4(-7.426e-03, 4.917e-03, 1.844e-02, -1.523e-02, -2.838e-02, -5.470e-03, 2.448e-02, -2.177e-02, -4.583e-02, -2.253e-02, 7.540e-02, -3.926e-01, -1.423e-02, 2.884e-02, -4.344e-01, 3.271e-01), r0);
	r0 = MulAdd(s1_2_2, M4(3.116e-03, -2.670e-03, 2.760e-03, 2.948e-02, -3.235e-02, 1.630e-02, -2.941e-02, 1.160e-01, -1.131e-02, -9.327e-03, -2.012e-02, 4.236e-02, -5.858e-03, -1.082e-02, -2.096e-02, 9.844e-02), r0);
	// RY converts RGB to a luma/chroma triple; its first row holds the same luma
	// weights (0.299, 0.587, 0.114) that Pass 1 uses. YR converts back to RGB.
	// NOTE(review): YR looks like the inverse of RY - not verified numerically here.
	static const MF3x3 RY = {0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081}, YR = {1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099};
	// opt scales output pixel coordinates to sampling coordinates; fpos is the
	// centre of the quad's top-left output pixel.
	float2 opt = float2(GetOutputPt()), fpos = (float2(gxy) + 0.5) * opt;
	MF3 yuv;
	// For each pixel of the 2x2 quad: sample INPUT with the linear sampler SL at
	// that output pixel's centre, add the matching r0 component to the first
	// channel (clamped to [0, 1] by saturate), convert back with YR and store
	// with alpha 1.0. Component mapping: x -> (0,0), y -> (1,0), z -> (0,1),
	// w -> (1,1).
	yuv = mul(RY, INPUT.SampleLevel(SL, fpos + float2(0.0, 0.0) * opt, 0).rgb);
	OUTPUT[gxy + int2(0, 0)] = MF4(mul(YR, MF3(saturate(yuv.r + r0.x), yuv.yz)), 1.0);
	yuv = mul(RY, INPUT.SampleLevel(SL, fpos + float2(1.0, 0.0) * opt, 0).rgb);
	OUTPUT[gxy + int2(1, 0)] = MF4(mul(YR, MF3(saturate(yuv.r + r0.y), yuv.yz)), 1.0);
	yuv = mul(RY, INPUT.SampleLevel(SL, fpos + float2(0.0, 1.0) * opt, 0).rgb);
	OUTPUT[gxy + int2(0, 1)] = MF4(mul(YR, MF3(saturate(yuv.r + r0.z), yuv.yz)), 1.0);
	yuv = mul(RY, INPUT.SampleLevel(SL, fpos + float2(1.0, 1.0) * opt, 0).rgb);
	OUTPUT[gxy + int2(1, 1)] = MF4(mul(YR, MF3(saturate(yuv.r + r0.w), yuv.yz)), 1.0);
}
